library(readr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(gridExtra)
library(ggpattern)
library(openxlsx)
library(ggpubr)
library(lme4)
library(ggsignif)

#### Taipei
# 17 subjects from Taipei including the younger group (4M,4F) ages from 20-35, while the older group (4M, 5F) from 50-65.

multmerge_n <- function(mypath){
  filenames <- list.files(mypath, pattern = "^N_")
  age <- c("O", "Y", "Y", "Y", 
           "O", "Y", "O", "Y", "O", "Y", 
           "Y", "O", "O", "O", "O", "Y", "O")
  gender <- c("M", "M", "F", "M", 
              "M", "M", "F" ,"F", "M", "F", 
              "F", "F", "F", "F", "F", "M", "M")
  
  datalist <- Map(function(x, y, z){
    f <- read_csv(x, locale = locale(encoding = "utf8"))

    f1 <- f %>%
      rename(order = ...1) %>%
      mutate(id = x, age = y, gender = z)
    return(f1)}, filenames, age, gender)
  Reduce(function(x,y) {bind_rows(x,y)}, datalist)
}

mycorpus_n <- multmerge_n("./")

nonchi_syll <- mycorpus_n$syll_c[str_detect(mycorpus_n$syll_c,"[a-zA-Z]")]
nonchi_word <- mycorpus_n$word_c[str_detect(mycorpus_n$word_c,"[a-zA-Z]")]

n_syll <- mycorpus_n %>%
  select(syll_e, syll_c, id, age, gender, order) %>%
  filter(!is.na(syll_e)) %>%
  filter(!syll_c %in% nonchi_syll) %>%
  mutate(area = "N")

n_word <- mycorpus_n %>%
  select(word_e, word_c, id, age, gender, order) %>%
  filter(!is.na(word_e)) %>%
  filter(!word_c %in% nonchi_word) %>%
  mutate(area = "N")

#### Kaohsiung
# 15 subjects from Kaohsiung, including the younger group (4M,4F) ages from 20-35, while the older group (4M, 3F) from 50-65.

multmerge_s <- function(mypath){
  filenames <- list.files(mypath, pattern = "^S_")
  age <- c("Y", "Y", "O", "Y", "Y", 
           "O", "Y", "O", "Y", "O", 
           "O", "Y", "O", "Y", "O")
  gender <- c("F", "F", "M", "M", "M",
              "F", "F", "F", "M", "M", 
              "F", "M", "M", "F", "M")
  
  datalist <- Map(function(x, y, z){
    f <- read_csv(x, locale = locale(encoding = "utf8"))

    f1 <- f %>%
      rename(order = ...1) %>%
      mutate(id = x, age = y, gender = z)
    return(f1)}, filenames, age, gender)
  Reduce(function(x,y) {bind_rows(x,y)}, datalist)
}

mycorpus_s <- multmerge_s("./")

nonchi_syll <- mycorpus_s$syll_c[str_detect(mycorpus_s$syll_c,"[a-zA-Z]")]
nonchi_word <- mycorpus_s$word_c[str_detect(mycorpus_s$word_c,"[a-zA-Z]")]

s_syll <- mycorpus_s %>%
  select(syll_e, syll_c, id, age, gender, order) %>%
  filter(!is.na(syll_e)) %>%
  filter(!syll_c %in% nonchi_syll) %>%
  mutate(area = "S")

s_word <- mycorpus_s %>%
  select(word_e, word_c, id, age, gender, order) %>%
  filter(!is.na(word_e)) %>%
  filter(!word_c %in% nonchi_word) %>%
  mutate(area = "S")

## compile all the syllables and words
syll <- bind_rows(n_syll, s_syll)
word <- bind_rows(n_word, s_word)

#Data pre-processing

n_ai <- read_csv("data - N2.csv", col_types = cols(gender = col_character(),
                                                        stress = col_factor(),
                                                        n = col_character()),
                 locale = locale(encoding = "utf8"))
s_ai <- read_csv("data - S2.csv", col_types = cols(gender = col_character(),
                                                        stress = col_factor(),
                                                        n = col_character()),
                 locale = locale(encoding = "utf8"))
data <- bind_rows(n_ai, s_ai)

len <- read_csv("df2.csv")
## Rows: 31 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Filename, area
## dbl (1): length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data5 <- right_join(data, len, by = c("Filename", "area"))

df5 <- read_csv("df5.csv", col_types = cols(n = col_character()))
data6 <- right_join(data5, df5, by = c("Filename", "n"))

blocks_even <- read_csv("df4.csv", col_types = cols(n = col_character()))
data7 <- right_join(data6, blocks_even, by = c("Filename", "n"))

#Data pre-processing
excluded <- c("truncated", "laughter", "N", "noise", "overlap", "o", "error", "deleted", "esch", "sche", "scha", "ae")

data1 <- data7 %>%
  filter(dur <= 0.3) %>%
  filter(!realization %in% excluded) %>%
  mutate(realization = gsub("^ai$", "[aɪ]", realization)) %>%
  mutate(realization = gsub("^ei$", "[eɪ]", realization)) %>%
  mutate(realization = gsub("^e$", "[e]", realization)) %>%
  mutate(realization = gsub("^a$", "[a]", realization)) %>%
  mutate(realization = gsub("^sch$", "[É™]", realization)) %>%
  mutate(realization = gsub("^esch$", "[eÉ™]", realization)) %>%
  mutate(realization = gsub("^asch$", "[aÉ™]", realization)) %>%
  mutate(realization = gsub("^ae$", "[aÉ›]", realization)) %>%
  mutate(dm = ifelse(realization == "[aɪ]", "[aɪ]",
                     ifelse(realization %in% c("[eɪ]", "[aə]"), "D", "M"))) %>%
  mutate(wl = gsub("^u_", "", place)) %>%
  mutate(wl = gsub("_[a-z]+", "", wl)) %>%
  mutate(context = gsub("^u_", "", place)) %>%
  mutate(context = gsub("[a-z]+_[1-9]_", "", context)) %>%
  mutate(con = ifelse(is.na(`1st`), "Head", ifelse(is.na(`3rd`), "Tail", "Middle"))) %>%
  mutate(context = ifelse(context %in% c("zh", "ch", "sh", "r", "z", "c", "s", "i", 
                                         "y", "d", "t", "n", "l", "j", "q", "x"), "cor",
                           ifelse(context %in% c("b", "p", "m", "f", "w"), "lab",
                                  ifelse(context %in% c("g", "k", "h", "o", "u", "e",
                                                        "a"), "dor", "else")))) %>%
  mutate(onset = gsub("(ai|uai)[01234]", "", label)) %>%
  mutate(onset1 = ifelse(onset %in% c("zh", "ch", "sh", "r", "z", "c", "s", "i",
                                         "y", "d", "t", "n", "l", "j", "q", "x"), "cor",
                           ifelse(onset %in% c("b", "p", "m", "f", "w"), "lab",
                                  ifelse(onset %in% c("g", "k", "h", "o", "u", "e",
                                                        "a", ""), "dor", "else")))) %>%
  mutate(pos = gsub("[mbtqph]_1", "head", wl)) %>%
  mutate(pos = gsub("[tqph]_2", "central", pos)) %>%
  mutate(pos = gsub("[qph]_3", "central", pos)) %>%
  mutate(pos = gsub("[ph]_4", "central", pos)) %>%
  mutate(pos = gsub("h_3", "central", pos)) %>%
  mutate(pos = gsub("h_5", "central", pos)) %>%
  mutate(pos = gsub("b_2", "tail", pos)) %>%
  mutate(pos = gsub("t_3", "tail", pos)) %>%
  mutate(pos = gsub("q_4", "tail", pos)) %>%
  mutate(pos = gsub("p_5", "tail", pos)) %>%
  mutate(pos = gsub("h_6", "tail", pos))

###POS examination
toPOS <- data1 %>%
  distinct(word)
#write_csv(toPOS, "toPOS.csv")
pos <- read_csv("POS1.csv", locale = locale(encoding = "utf-8"))
## Rows: 831 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): POS, word_c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pos1 <- rename(pos, word = word_c)

C <- c("Caa", "Cbb")
ADV <- c("Da", "Dfa", "Dfb", "D", "Dk")
POST <- c("Cab", "Cba", "Neqb")
ASP <- c("Di")
N <- c("Na", "Nb", "Nc", "Nd", "Ncd", "Nh", "Ng")
DET <- c("Neu", "Nes", "Nep", "Neqa")
M <- c("Nf")
T1 <- c("I", "T", "DE")
Vi <- c("VA", "VB", "VH", "VI")
Vt <- c("VAC", "VC", "VCL", "VD", "VE", "VF", "VG", "VHC", "VJ", "VK", "VL", "SHI", "V_2")

for (i in seq(length(pos1$POS))) {
  if (pos1$POS[i] %in% C) {
    pos1$POS[i] <- "C"
  } else if (pos1$POS[i] %in% ADV) {
    pos1$POS[i] <- "ADV"
  } else if (pos1$POS[i] %in% POST) {
    pos1$POS[i] <- "else"
  } else if (pos1$POS[i] %in% ASP) {
    pos1$POS[i] <- "else"
  } else if (pos1$POS[i] %in% N) {
    pos1$POS[i] <- "N"
  } else if (pos1$POS[i] %in% DET) {
    pos1$POS[i] <- "DET"
  } else if (pos1$POS[i] %in% M) {
    pos1$POS[i] <- "M"
  } else if (pos1$POS[i] %in% T1) {
    pos1$POS[i] <- "T"
  } else if (pos1$POS[i] %in% Vi) {
    pos1$POS[i] <- "V"
  } else if (pos1$POS[i] %in% Vt) {
    pos1$POS[i] <- "V"
  } else if (pos1$POS[i] == "COMMACATEGORY") {
    pos1$POS[i] <- "else"
  } else if (pos1$POS[i] == "PERIODCATEGORY") {
    pos1$POS[i] <- "else"
  } 
}

pos2 <- pos1 %>%
  mutate(POS = ifelse(POS %in% c("N", "V", "ADV", "A"), "content", "function")) 

data2 <- right_join(data1, pos2, by = "word")

tidy <- read_csv("data - frequency.csv")
## Rows: 9957 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Word, Pinyin
## dbl (5): No, Rank, Frequency, Percent, Cumulation
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tidy1 <- tidy %>%
  filter(Pinyin %in% Pinyin[str_detect(Pinyin, "ai")]) %>%
  mutate(No = seq(1:n()))

word_h <- tidy1 %>%
  filter(Cumulation <= 50)

word_l <- tidy1 %>%
  filter(Cumulation > 50)

data3 <- data2 %>%
  mutate(ag = paste0(area, age, gender)) %>%
  mutate(freq = ifelse(word %in% word_h$Word, "H", "L"))

data3$tone <- str_replace(data3$label, "[a-z]+", "")
data4 <- data3 %>%
  filter(tone != "0") %>%
  filter(stress != "0") 

foc <- data4 %>%
  group_by(Filename, word, freq) %>%
  filter(order == min(order)) %>%
  ungroup() %>%
  mutate(giv = "foc")

rest <- data4 %>%
  group_by(Filename, word, freq) %>%
  filter(order != min(order)) %>%
  ungroup() %>%
  mutate(giv = "soc")

data3 <- bind_rows(foc, rest)

##number of tokens across groups and realizations

data3 %>%
  group_by(realization) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 6 × 2
## # Groups:   realization [6]
##   realization     n
##   <chr>       <int>
## 1 [aɪ]         4677
## 2 [e]          2795
## 3 [É™]          1023
## 4 [a]           506
## 5 [eɪ]          242
## 6 [aÉ™]           74

##overall reduction rate

data3 %>%
  mutate(sum = length(Filename)) %>%
  mutate(dm = ifelse(realization == "[aɪ]", "[aɪ]",
                    ifelse(realization %in% c("[eɪ]", "[aə]"), "D", "M"))) %>%
  group_by(sum, dm) %>%
  reframe(n = n(), p = n/sum*100) %>%
  distinct()
## # A tibble: 3 × 4
##     sum dm        n     p
##   <int> <chr> <int> <dbl>
## 1  9317 D       316  3.39
## 2  9317 M      4324 46.4 
## 3  9317 [aɪ]   4677 50.2

Overall reduction

dm3 <- data3 %>%
  mutate(stress = "all") %>%
  mutate(order = ifelse(realization == "[aɪ]", 6, 
                        ifelse(realization == "[eɪ]", 5,
                               ifelse(realization == "[aÉ™]", 4,
                                      ifelse(realization == "[e]", 3,
                                             ifelse(realization == "[a]", 2, 1)))))) %>%
  group_by(realization, stress, order) %>%
  summarise(sum = 9326, n = n(), p = 100*n/sum) %>%
  ungroup() %>%
  select(-sum, -n) %>%
  mutate(realization = as.factor(realization)) %>%
  arrange(realization) %>%
  mutate(realization = fct_reorder(realization, order))
## `summarise()` has grouped output by 'realization', 'stress'. You can override
## using the `.groups` argument.
f_rea <- ggplot(dm3, aes(x= stress, y = p, fill = realization)) +
  geom_bar(stat = "identity", position="stack", width = .6, color = "black", alpha = .9,
           show.legend = FALSE) +
  labs(x = "", y = "Percentage of number", fill = "") +
  geom_text(aes(label = realization), size = 14, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)
            ) +
  scale_y_continuous(expand = c(0,0), limit = c(0,100)) +
  theme_bw() +
  theme(plot.title = element_text(size = 26, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 26),
        strip.text.x = element_text(size = 26),
        legend.title = element_text(size = 26),
        legend.text = element_text(size = 26),
        legend.position = "top",
        plot.margin = margin(1, 1, 1, 1, "cm"),
        axis.title = element_text(size = 26),
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank(),
        panel.background=element_rect(colour="black", size = 1)) +
  coord_flip() +
  scale_fill_manual(
        values = c("#9575cd", "#7986cb", "#729fed","#ffdf45", "#ffb74d", "#f8766d"),
        limits = c("[a]", "[ə]","[e]", "[aə]", "[eɪ]", "[aɪ]"),
        breaks = c("[a]", "[ə]","[e]", "[aə]", "[eɪ]", "[aɪ]"),
        name = "Realization",
        labels = c("[a]", "[ə]","[e]", "[aə]", "[eɪ]", "[aɪ]"),
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
f_rea

#ggsave("f_rea.png", dpi = 600, width = 20, height = 5)

##POS reduction rate

s_r1 <- data3 %>%
  group_by(POS, stress) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  mutate(order = ifelse(dm == "[aɪ]", 3,
                        ifelse(dm == "D", 2, 1))) %>%
  mutate(POS = ifelse(POS == "function", "Function", "Content")) %>%
  mutate(stress= ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(dm = ifelse(dm == "M", "Monophthongs", ifelse(dm == "D", "Diphthongs", "Retained [aɪ]"))) %>%
  mutate(stress = as.factor(stress)) %>%
  group_by(dm, stress, POS) %>% 
  reframe(num = n(), p = 100*num/count) %>% 
  ungroup() %>%
  distinct() %>%
  filter(dm == "Retained [aɪ]") 

f_dm1 <- ggplot(s_r1, aes(x= stress, y = p, fill = POS)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .7, color = "black") +
  labs(x = "Stress", 
       y = "Ratio of retained [aɪ]",
       fill = "") +
  geom_text(aes(label = round(p)), size = 5, check_overlap = FALSE,
             position = position_dodge(width = .7), vjust = 4) +
  scale_y_continuous(expand = c(0,0), limit = c(0,100)) +
  theme_bw() +
  scale_fill_manual(
        values = c("#f8766d", "#729fed"),
        limits = c("Content", "Function"),
        breaks = c("Content", "Function"),
        name = "Word Category",
        labels = c("Content", "Function")
  ) +
  theme(plot.title = element_text(size = 16, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 14),
        strip.text.x = element_text(size = 16),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        #plot.margin = margin(0, 1, 0, 0, "cm"),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) 

f_dm1

#ggsave("f1_c.png", dpi = 600, width = 8, height = 5)

##freq reduction rate

s_r2 <- data3 %>%
  filter(POS != "function") %>%
  group_by(freq,
           giv,
           stress) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  mutate(order = ifelse(dm == "[aɪ]", 3,
                        ifelse(dm == "D", 2, 1))) %>%
  filter(freq != "M") %>%
  mutate(freq = ifelse(freq == "H", "High", "Low")) %>%
  mutate(stress= ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(giv = ifelse(giv == "foc", "First", "Second")) %>%
  mutate(dm = ifelse(dm == "M", "Monophthongs", ifelse(dm == "D", "Diphthongs", "Retained [aɪ]"))) %>%
  mutate(type = paste(freq, giv)) %>%
  mutate(stress = as.factor(stress)) %>%
  group_by(dm, stress, freq, giv, type) %>% 
  reframe(num = n(), p = 100*num/count) %>% 
  ungroup() %>%
  distinct() %>%
  filter(dm == "Retained [aɪ]") 

f_dm2 <- ggplot(s_r2, aes(x= stress, y = p, fill = type)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .8, 
           color = "black") +
  labs(x = "Stress", 
       y = "Ratio of retained [aɪ]",
       fill = "") +
  geom_text(aes(label = round(p)), size = 5, check_overlap = FALSE,
             position = position_dodge(width = .8), vjust = 5) +
  scale_y_continuous(expand = c(0,0), limit = c(0,100)) +
  theme_bw() +
  scale_fill_manual(
        values = c("light blue", "#729fed", "pink", "#f8766d"),
        limits = c("High First", "High Second", "Low First", "Low Second"),
        breaks = c("High First", "High Second", "Low First", "Low Second"),
        name = "",
        labels = c("High First", "High Second", "Low First", "Low Second")
  ) +
  theme(plot.title = element_text(size = 16, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 16),
        strip.text.x = element_text(size = 16),
        legend.title = element_text(size = 14),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        #plot.margin = margin(0, 1, 0, 0, "cm"),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) 

f_dm2

#Statistics

lr_f <- data3 %>%
  #filter(giv != "soc") %>%
  #filter(stress != 1) %>%
  #filter(area == "S") %>%
  #filter(gender == "F") %>%
  #filter(age == "Y") %>%
  #filter(POS != "function") %>%
  #filter(pos != "central") %>%
  #filter(freq != "M") %>%
  mutate(reduce = ifelse(realization == "[aɪ]", "1", "0")) %>%
  mutate(giv = as.factor(giv)) %>%
  mutate(freq = as.factor(freq)) %>%
  mutate(POS = as.factor(POS)) %>%
  mutate(pos = as.factor(pos)) %>%
  mutate(pos = as.factor(con)) %>%
  mutate(stress = as.factor(stress)) %>%
  mutate(gender = as.factor(gender)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(reduce = as.factor(reduce))

#model_r <- glmer(reduce~giv*stress*freq + (1|Filename), data = lr_f, family = binomial)
model_r <- glmer(reduce~POS*stress + age*gender*area + (1|Filename), data = lr_f, family = binomial)

summary(model_r)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: reduce ~ POS * stress + age * gender * area + (1 | Filename)
##    Data: lr_f
## 
##      AIC      BIC   logLik deviance df.resid 
##  12055.0  12154.9  -6013.5  12027.0     9303 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6462 -0.9155  0.2743  0.8925  2.2536 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Filename (Intercept) 0.08698  0.2949  
## Number of obs: 9317, groups:  Filename, 31
## 
## Fixed effects:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.45171    0.15975   2.828  0.00469 ** 
## POSfunction         -0.70174    0.05545 -12.657  < 2e-16 ***
## stress1             -1.05288    0.07595 -13.862  < 2e-16 ***
## stress3              1.81288    0.16890  10.733  < 2e-16 ***
## ageY                -0.27417    0.22410  -1.223  0.22118    
## genderM             -0.12992    0.22695  -0.572  0.56700    
## areaS               -0.14704    0.24865  -0.591  0.55428    
## POSfunction:stress1  0.32709    0.14774   2.214  0.02683 *  
## POSfunction:stress3  0.92445    0.55379   1.669  0.09506 .  
## ageY:genderM         0.01590    0.31857   0.050  0.96021    
## ageY:areaS           0.43517    0.33564   1.297  0.19479    
## genderM:areaS        0.13400    0.33704   0.398  0.69094    
## ageY:genderM:areaS  -0.37914    0.46375  -0.818  0.41361    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

difference of difference

s_r <- foc %>%
  group_by(freq, POS, age, gender, area, stress) %>%
  count()
  
s_r1 <- foc %>%
  select(-n) %>%
  mutate(order = ifelse(dm == "[aɪ]", 3,
                        ifelse(dm == "D", 2, 1))) 


dm1 <- right_join(s_r, s_r1,  by = c("freq", "POS", "age", "gender", "area", "stress"))

dm <- dm1 %>%
  filter(freq != "M") %>%
  mutate(stress= ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(dm = ifelse(dm == "M", "Monophthongs", ifelse(dm == "D", "Diphthongs", "Retained [aɪ]"))) %>%
  mutate(stress = as.factor(stress)) %>%
  group_by(dm, stress, freq, POS, area, gender, age, order) %>% 
  reframe(num = n(), p = 100*num/n) %>% 
  ungroup() %>%
  distinct() %>%
  select(-num) %>%
  mutate(dm = as.factor(dm)) %>%
  arrange(dm) %>%
  mutate(dm = fct_reorder(dm, order))

dm2 <- dm %>%
  filter(dm == "Retained [aɪ]") %>%
  spread(stress, p) %>%
  mutate(df12 = S2-S1, df23 = S3-S2)

lr_f <- dm2 %>%
  filter(POS != "function") %>%
  mutate(POS = as.factor(POS)) %>%
  mutate(gender = as.factor(gender)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(age = as.factor(age)) 

#model_r <- aov(dff~age*gender*area + Error(1/stress), data = lr_f)
model_r <- aov(df23~age*gender*area + Error(1/freq), data = lr_f)
summary(model_r)
## 
## Error: Within
##                 Df Sum Sq Mean Sq F value Pr(>F)
## age              1   15.4    15.4   0.049  0.831
## gender           1  151.1   151.1   0.480  0.511
## area             1   74.2    74.2   0.235  0.642
## age:gender       1    8.7     8.7   0.028  0.872
## age:area         1  399.5   399.5   1.268  0.297
## gender:area      1   90.1    90.1   0.286  0.609
## age:gender:area  1  384.0   384.0   1.219  0.306
## Residuals        7 2204.8   315.0

ai to schwa ratio POS

s_r3 <- data3 %>%
  filter(dm == "M") %>%
  filter(stress != 3) %>%
  filter(stress == 1) %>%
  filter(POS == "content") %>%
  group_by(age, gender) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  mutate(order2 = ifelse(realization == "[e]", 3,
                        ifelse(realization == "[a]", 2, 1))) %>%
  mutate(POS = ifelse(POS == "function", "Function word", "Content word")) %>%
  mutate(stress= ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(area = ifelse(area == "N", "North", "South")) %>%
  mutate(gender = ifelse(gender == "F", "Female", "Male")) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(stress = as.factor(stress)) %>%
  group_by(realization, age, gender,
           order2) %>% 
  reframe(num = n(), p = 100*num/count) %>% 
  ungroup() %>%
  distinct() %>%
  mutate(realization = as.factor(realization)) %>%
  arrange(realization) %>%
  mutate(realization = fct_reorder(realization, order2))

f_dm <- ggplot(s_r3, aes(x= gender, y = p, fill = realization)) +
  geom_bar(stat = "identity", 
           width = .5, color = "black") +
  labs(x = "Gender", 
       y = "Ratio of monophthongized /aɪ/",
       fill = "") +
  geom_text(aes(label = round(p)), size = 6, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)
            ) + 
  scale_y_continuous(expand = c(0,0), limit = c(0,100.2)) +
  theme_bw() +
  theme(plot.title = element_text(size = 16, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 12),
        strip.text.x = element_text(size = 16),
        strip.text.y = element_text(size = 16),
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.margin = margin(0, 1, 0, 0, "cm"),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) +
  facet_grid(~age)

f_dm

#ggsave("f1mo_ag.png", dpi = 600, width = 8, height = 4)

ai to schwa ratio frequency

s_r4 <- data3 %>%
  filter(giv == "soc") %>%
  filter(stress != 3) %>%
  filter(dm == "M") %>%
  filter(POS == "content") %>%
  group_by(freq, stress) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  mutate(order2 = ifelse(realization == "[e]", 3,
                        ifelse(realization == "[a]", 2, 1))) %>%
  mutate(POS = ifelse(POS == "function", "Function word", "Content word")) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(area = ifelse(area == "N", "North", "South")) %>%
  mutate(freq = ifelse(freq == "H", "High", "Low")) %>%
  mutate(stress= ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(stress = as.factor(stress)) %>%
  group_by(realization, order2, stress, freq) %>% 
  reframe(num = n(), p = 100*num/count) %>% 
  ungroup() %>%
  distinct() %>%
  mutate(realization = as.factor(realization)) %>%
  arrange(realization) %>%
  mutate(realization = fct_reorder(realization, order2))

f_dm4 <- ggplot(s_r4, aes(x= stress, y = p, fill = realization)) +
  geom_bar(stat = "identity", 
           width = .5, color = "black") +
  labs(x = "Stress", 
       y = "Ratio of monophthongized /aɪ/",
       fill = "") +
  geom_text(aes(label = round(p)), size = 6, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)
            ) + 
  scale_y_continuous(expand = c(0,0), limit = c(0,100.2)) +
  theme_bw() +
  theme(plot.title = element_text(size = 16, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 12),
        strip.text.x = element_text(size = 16),
        strip.text.y = element_text(size = 16),
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        plot.margin = margin(0, 1, 0, 0, "cm"),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) +
  facet_grid(~freq)

f_dm4

#ggsave("f1mo_old_freq.png", dpi = 600, width = 8, height = 4)

monophthongs stats

chi-square

library(RVAideMemoire)
## Warning: package 'RVAideMemoire' was built under R version 4.3.1
## *** Package RVAideMemoire v 0.9-83-7 ***
## 
## Attaching package: 'RVAideMemoire'
## The following object is masked from 'package:lme4':
## 
##     dummy
chi <- data3 %>%
  filter(dm == "M") %>%
  filter(POS == "function") %>%
  filter(stress != 3) %>%
  mutate(stress = ifelse(stress == "2", "S2", "S1")) 

# Stress in cont and func
table1 <- table(chi$realization, chi$stress)
chisq.test(table1, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table1
## X-squared = 16.476, df = 2, p-value = 0.0002644
chisq.multcomp(table1, p.method = "holm")
## 
##  Pairwise comparisons using chi-squared tests 
## 
## data:  table1 
## 
##     14      58      98      179     228    
## 58  8.6e-07 -       -       -       -      
## 98  1.4e-14 0.0027  -       -       -      
## 179 < 2e-16 2.3e-14 3.4e-06 -       -      
## 228 < 2e-16 < 2e-16 3.0e-12 0.0151  -      
## 747 < 2e-16 < 2e-16 < 2e-16 < 2e-16 < 2e-16
## 
## P value adjustment method: holm
table1
##      
##        S1  S2
##   [a]  14  58
##   [e] 179 747
##   [É™]  98 228
#post hoc residuals
chisq.test(table1, correct=F)$stdres
##      
##               S1         S2
##   [a] -0.5340425  0.5340425
##   [e] -3.5496260  3.5496260
##   [É™]  4.0590504 -4.0590504
sig <- .05
sigadj <- sig/(nrow(table1)*ncol(table1))
sigadj
## [1] 0.008333333
qnorm(sigadj/2)
## [1] -2.638257
chi2 <- data3 %>%
  filter(dm == "M") %>%
  filter(stress == "2") 

# POS in S2 and S1
table2 <- table(chi2$realization, chi2$POS)
chisq.test(table2, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table2
## X-squared = 42.287, df = 2, p-value = 6.568e-10
chisq.multcomp(table2, p.method = "holm")
## 
##  Pairwise comparisons using chi-squared tests 
## 
## data:  table2 
## 
##      58      228     302     460     747    
## 228  < 2e-16 -       -       -       -      
## 302  < 2e-16 0.0013  -       -       -      
## 460  < 2e-16 < 2e-16 2.1e-08 -       -      
## 747  < 2e-16 < 2e-16 < 2e-16 4.3e-16 -      
## 1528 < 2e-16 < 2e-16 < 2e-16 < 2e-16 < 2e-16
## 
## P value adjustment method: holm
table2
##      
##       content function
##   [a]     302       58
##   [e]    1528      747
##   [É™]     460      228
#post hoc residuals
chisq.test(table2, correct=F)$stdres
##      
##         content  function
##   [a]  6.501107 -6.501107
##   [e] -3.209054  3.209054
##   [É™] -1.306640  1.306640
sig <- .05
sigadj <- sig/(nrow(table2)*ncol(table2))
sigadj
## [1] 0.008333333
qnorm(sigadj/2)
## [1] -2.638257
chi3 <- data3 %>%
  filter(dm == "M") %>%
  filter(giv == "soc") %>%
  #filter(freq != "L") %>%
  filter(stress != 3) %>%
  filter(stress == 2) %>%
  mutate(stress = ifelse(stress == "2", "S2", "S1")) %>%
  filter(POS == "content")

#freq in S1, not in S2
#stress low: [e], high: all
#gender/age in S1: young more [a], male more [e], fewer schwa
#area in S1 low: south more schwa, fewer [e]
table3 <- table(chi3$realization, chi3$freq)
chisq.test(table3, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table3
## X-squared = 1.4724, df = 2, p-value = 0.4789
chisq.multcomp(table3, p.method = "holm")
## 
##  Pairwise comparisons using chi-squared tests 
## 
## data:  table3 
## 
##     73      129     154     225     381    
## 129 0.00024 -       -       -       -      
## 154 3.8e-07 0.13725 -       -       -      
## 225 < 2e-16 1.3e-06 0.00053 -       -      
## 381 < 2e-16 < 2e-16 < 2e-16 1.4e-09 -      
## 761 < 2e-16 < 2e-16 < 2e-16 < 2e-16 < 2e-16
## 
## P value adjustment method: holm
table3
##      
##         H   L
##   [a] 154  73
##   [e] 761 381
##   [É™] 225 129
#post hoc residuals
chisq.test(table3, correct=F)$stdres
##      
##                H          L
##   [a]  0.5733416 -0.5733416
##   [e]  0.5827610 -0.5827610
##   [É™] -1.1618225  1.1618225
sig <- .05
sigadj <- sig/(nrow(table3)*ncol(table3))
sigadj
## [1] 0.008333333
qnorm(sigadj/2)
## [1] -2.638257
chi2 <- data3 %>%
  filter(dm == "M") %>%
  filter(stress == "2") %>%
  filter(POS == "function")

# gender in S2 func
table2 <- table(chi2$realization, chi2$gender)
chisq.test(table2, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table2
## X-squared = 9.8681, df = 2, p-value = 0.007197

[e] and [a]

chi <- data3 %>%
  filter(dm == "M") %>%
  filter(realization != "[É™]") %>%
  filter(POS == "function") %>%
  filter(stress != 3) %>%
  mutate(stress = ifelse(stress == "2", "S2", "S1")) 

# Stress in cont, not func
table1 <- table(chi$realization, chi$stress)
chisq.test(table1, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table1
## X-squared = 0.00055649, df = 1, p-value = 0.9812
chi2 <- data3 %>%
  filter(dm == "M") %>%
  filter(realization != "[É™]") %>%
  filter(stress == "2") 

# POS in S2 and S1: [a]
table2 <- table(chi2$realization, chi2$POS)
chisq.test(table2, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table2
## X-squared = 40.973, df = 1, p-value = 1.543e-10
chi3 <- data3 %>%
  filter(dm == "M") %>%
  filter(realization != "[É™]") %>%
  filter(giv == "soc") %>%
  filter(POS == "content") %>%
  filter(stress == "1") 

#freq in S1, not in S2
#gender/age in S1
table3 <- table(chi3$realization, chi3$area)
chisq.test(table3, correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  table3
## X-squared = 2.5061, df = 1, p-value = 0.1134

reduction rate cross time

re <- data3 %>%
  mutate(block1 = as.factor(block1)) %>%
  mutate(block_all = as.factor(block_all)) %>%
  mutate(reduction = ifelse(realization == "[aɪ]", 1, 0)) %>%
  group_by(area, age, gender, block_all,
           #block1, 
           ag, Filename) %>%
  summarise(mean = mean(reduction)*100) %>%
  ungroup() %>%
  group_by(area, age, gender, block_all,
           #block1, 
           ag) %>%
  summarise(n = n(), m = mean(mean), sd = sd(mean),
            se = sd/sqrt(n), max = m+se
            ) 
## `summarise()` has grouped output by 'area', 'age', 'gender', 'block_all', 'ag'.
## You can override using the `.groups` argument.
## `summarise()` has grouped output by 'area', 'age', 'gender', 'block_all'. You
## can override using the `.groups` argument.
f_re <- ggplot(re, aes(x= ag, y = m, fill = block_all)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .7, 
           color = "black") +
  #labs(x = "Realization", y = "Duration (ms)") +
  geom_errorbar(aes(ymin = m, ymax = max), 
                position = position_dodge(width = .7, preserve = "single"), 
                width = .6) +
  theme_bw() +
  scale_y_continuous(expand = c(0,0), limit = c(0, 100))
f_re

# stats1

data_re <- data3 %>%
  select(area, gender, age, realization, block_all, Filename) %>%
  mutate(area = ifelse(area == "S", "1", "0")) %>%
  mutate(gender = ifelse(gender == "F", "1", "0")) %>%
  mutate(age = ifelse(age == "Y", "1", "0")) %>%
  mutate(reduce = ifelse(realization == "[aɪ]", "1", "0")) %>%
  mutate(reduce = as.factor(reduce)) %>%
  mutate(block_all = as.factor(block_all))
  
model_re <- glmer(reduce~block_all*age*gender*area + (1|Filename), data = data_re, family = binomial)
summary(model_re)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: reduce ~ block_all * age * gender * area + (1 | Filename)
##    Data: data_re
## 
##      AIC      BIC   logLik deviance df.resid 
##  12729.2  12907.7  -6339.6  12679.2     9292 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6188 -0.9531  0.6178  0.9662  1.4018 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Filename (Intercept) 0.07675  0.277   
## Number of obs: 9317, groups:  Filename, 31
## 
## Fixed effects:
##                               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                    0.26783    0.18740   1.429   0.1530  
## block_all2                    -0.16502    0.15219  -1.084   0.2782  
## block_all3                    -0.09582    0.18365  -0.522   0.6018  
## age1                          -0.49887    0.26057  -1.915   0.0556 .
## gender1                        0.32408    0.26182   1.238   0.2158  
## area1                         -0.07399    0.26191  -0.283   0.7775  
## block_all2:age1                0.04716    0.20716   0.228   0.8199  
## block_all3:age1                0.19014    0.24122   0.788   0.4306  
## block_all2:gender1            -0.06872    0.20795  -0.330   0.7411  
## block_all3:gender1            -0.44025    0.24688  -1.783   0.0745 .
## age1:gender1                   0.12015    0.36494   0.329   0.7420  
## block_all2:area1               0.12339    0.21162   0.583   0.5598  
## block_all3:area1              -0.04357    0.24623  -0.177   0.8595  
## age1:area1                     0.21204    0.36570   0.580   0.5620  
## gender1:area1                 -0.46709    0.39643  -1.178   0.2387  
## block_all2:age1:gender1       -0.15009    0.28437  -0.528   0.5976  
## block_all3:age1:gender1       -0.04881    0.33020  -0.148   0.8825  
## block_all2:age1:area1         -0.25877    0.29183  -0.887   0.3752  
## block_all3:age1:area1         -0.18459    0.33203  -0.556   0.5783  
## block_all2:gender1:area1       0.33979    0.32357   1.050   0.2937  
## block_all3:gender1:area1       0.16880    0.38126   0.443   0.6580  
## age1:gender1:area1             0.11665    0.53773   0.217   0.8283  
## block_all2:age1:gender1:area1  0.28099    0.42990   0.654   0.5134  
## block_all3:age1:gender1:area1  0.48683    0.49828   0.977   0.3286  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 24 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

first 30% and last 30%

re1 <- data3 %>%
  filter(block_30 != "m") %>%
  mutate(block_30 = ifelse(block_30 == "f", "First", "Last")) %>%
  mutate(gender = ifelse(gender == "M", "Male", "Female")) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(reduction = ifelse(realization == "[aɪ]", 1, 0)) %>%
  group_by(age, 
           gender, 
           block_30, Filename) %>%
  summarise(mean = mean(reduction)*100) %>%
  ungroup() %>%
  group_by(age, 
           gender, 
           block_30) %>%
  summarise(n = n(), m = mean(mean), sd = sd(mean),
            se = sd/sqrt(n), max = m+se) %>%
  ungroup() %>%
  mutate(annotation = ifelse(gender == "Females", "***", "N.S."))
## `summarise()` has grouped output by 'age', 'gender', 'block_30'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'age', 'gender'. You can override using the
## `.groups` argument.
f_re1 <- ggplot(re1, aes(x= age, y = m, fill = block_30)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .7, 
           color = "black") +
  labs(x = "Age", y = "Ratio of retained [aɪ]", fill = "Block") +
  geom_errorbar(aes(ymin = m, ymax = max), 
                position = position_dodge(width = .7, preserve = "single"), 
                width = .5) +
  facet_grid(~gender) +
  theme_bw() +
  scale_y_continuous(expand = c(0,0), limit = c(0, 100)) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        legend.key.size = unit(0.3, 'cm'),
        axis.title = element_text(size = 18),
        #legend.position = c(0.9, 0.7),
        panel.background=element_rect(colour="black", size = 1),
        legend.background = element_rect(fill="white",
                                  size=0.5, linetype="solid", 
                                  colour ="black"),
        strip.background = element_rect(color = "black", size = 0.8),
        axis.text.y = element_text(size = 16)) +
  geom_signif(comparisons=list(c("Old", "Young")), annotations="*",
              y_position = 90, tip_length = 0.1, vjust=0) 

f_re1

#ggsave("f_block.png", dpi = 600, width = 7, height = 4)

stats1

data_re1 <- data3 %>%
  select(area, gender, age, realization, dm, block_30, Filename) %>%
  filter(block_30 != "m") %>%
  #filter(block_30 == "l") %>%
  #filter(gender == "F") %>%
  #filter(area == "S") %>%
  #filter(age == "O") %>%
  mutate(block_30 = as.factor(block_30)) %>%
  mutate(area = ifelse(area == "S", "1", "0")) %>%
  mutate(gender = ifelse(gender == "F", "1", "0")) %>%
  mutate(age = ifelse(age == "Y", "1", "0")) %>%
  mutate(reduce = ifelse(realization == "[aɪ]", "1", "0")) %>%
  #mutate(reduce = ifelse(dm == "M", "1", "0")) %>%
  #mutate(stress = as.factor(stress)) %>%
  mutate(reduce = as.factor(reduce))
  
model_re1 <- glmer(reduce~block_30*age*area*gender + (1|Filename), data = data_re1, family = binomial)
summary(model_re1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: reduce ~ block_30 * age * area * gender + (1 | Filename)
##    Data: data_re1
## 
##      AIC      BIC   logLik deviance df.resid 
##   7668.5   7781.2  -3817.3   7634.5     5572 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5486 -0.9667  0.6457  0.9721  1.3293 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Filename (Intercept) 0.05973  0.2444  
## Number of obs: 5589, groups:  Filename, 31
## 
## Fixed effects:
##                               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                   0.227752   0.169654   1.342   0.1795  
## block_30l                     0.001884   0.166988   0.011   0.9910  
## age1                         -0.540987   0.234963  -2.302   0.0213 *
## area1                        -0.094824   0.235304  -0.403   0.6870  
## gender1                       0.395732   0.235349   1.681   0.0927 .
## block_30l:age1                0.221749   0.219812   1.009   0.3131  
## block_30l:area1              -0.057760   0.225090  -0.257   0.7975  
## age1:area1                    0.280087   0.327728   0.855   0.3928  
## block_30l:gender1            -0.530990   0.222831  -2.383   0.0172 *
## age1:gender1                  0.055426   0.326556   0.170   0.8652  
## area1:gender1                -0.399094   0.354750  -1.125   0.2606  
## block_30l:age1:area1         -0.290150   0.302721  -0.958   0.3378  
## block_30l:age1:gender1       -0.065879   0.299509  -0.220   0.8259  
## block_30l:area1:gender1       0.191273   0.343241   0.557   0.5774  
## age1:area1:gender1            0.035786   0.480069   0.075   0.9406  
## block_30l:age1:area1:gender1  0.615402   0.450333   1.367   0.1718  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 16 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
coefs <- data.frame(coef(summary(model_re1)))
coefs <- coefs %>%
  mutate(Estimate = round(Estimate, 3)) %>%
  mutate(Std..Error = round(Std..Error, 3)) %>%
  mutate(z.value = round(z.value, 3)) %>%
  mutate(Pr...z.. = round(Pr...z.., 4))

#write.csv(coefs, "table4.csv")

#realization and duration POS

sr4 <- data3 %>%
  filter(dm != "D") %>%
  group_by(gender, age, area, stress, realization, Filename, POS) %>%
  mutate(tokens = n()) %>%
  ungroup() %>%
  mutate(order2 = ifelse(stress == "1", 1,
                        ifelse(stress == "2", 2,
                               ifelse(stress == "3", 3,
                                      0)))) %>%
  mutate(realization = ifelse(realization == "[aɪ]", "Retained /aɪ/", "Reduced /aɪ/")) %>%
  mutate(order1 = ifelse(realization == "Retained /aɪ/", 1, 2)) %>%
  mutate(ag = paste0(area, age, gender)) %>%
  group_by(realization, stress, order2, order1, Filename, POS) %>%
  summarise(m = 1000*mean(dur)) %>%
  ungroup() %>%
  group_by(realization, stress, order2, POS, order1
           #ag
           ) %>%
  summarise(n = n(), mean = mean(m), se = sd(m)/sqrt(n), 
            max = mean+se, min = mean-se) %>%
  ungroup() %>%
  distinct() %>%
  mutate(realization = as.factor(realization)) %>%
  arrange(realization) %>%
  mutate(realization = fct_reorder(realization, order1)) %>%
  mutate(stress = as.factor(stress)) %>%
  arrange(stress) %>%
  mutate(stress = fct_reorder(stress, order2))
## `summarise()` has grouped output by 'realization', 'stress', 'order2',
## 'order1', 'Filename'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'realization', 'stress', 'order2', 'POS'.
## You can override using the `.groups` argument.
f1_pos <- ggplot(sr4, aes(x= POS, y = mean, fill = stress)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .7, 
           #fill = "gray", 
           color = "black") +
  labs(x = "Word category", y = "Duration (ms)", fill = "Stress") +
      # , subtitle = "n ≥ 10") +
  geom_errorbar(aes(ymin = mean, ymax = max), 
                position = position_dodge(width = .7, preserve = "single"), 
                width = .4) +
  theme_bw() +
  facet_grid(~realization) +
  scale_y_continuous(expand = c(0,0), limit = c(0, 200)) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 18),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) +
      scale_fill_manual(
        values = c("#e63946", "#329cff", "#ffbf46"),
        #values = c("white", "dark gray", "black"),
        limits = c("1", "2", "3"),
        breaks = c("1", "2", "3"),
        labels = c("S1", "S2", "S3")
  )

f1_pos

#ggsave("f2_c.png", dpi = 600, width = 10, height = 5)

function word in S3

sr6 <- data3 %>%
  filter(stress == 3) %>%
  filter(POS == "function") %>%
  filter(realization == "[aɪ]") %>%
  group_by(word) %>%
  summarise(tokens = n(), m = 1000*mean(dur)) %>%
  ungroup() %>%
  filter(tokens > 2)

f1_word <- ggplot(sr6, aes(x= word, y = m)) +
  geom_bar(stat = "identity", 
           width = .7, 
           color = "black") +
  labs(x = "Word", y = "Duration (ms)") +
  theme_bw() +
  scale_y_continuous(expand = c(0,0), limit = c(0, 250)) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18, family = "Songti TC"),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 18),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) 

f1_word

#realization and duration freq

sr2 <-data3 %>%
  filter(POS != "function") %>%
  #filter(dm != "D") %>%
  group_by(gender, age, area, stress, realization, Filename, freq, giv) %>%
  mutate(tokens = n()) %>%
  ungroup() %>%
  #mutate(check = ifelse(tokens < 10, "ignore", "count")) %>%
  mutate(order1 = ifelse(realization == "[aɪ]", 6,
                        ifelse(realization == "[eɪ]", 5,
                              ifelse(realization == "[a]", 3,
                                    ifelse(realization == "[e]", 4,
                                        ifelse(realization == "[É™]", 2,
                                             1)))))) %>%
  mutate(order2 = ifelse(stress == "1", 1,
                        ifelse(stress == "2", 2,
                               ifelse(stress == "3", 3,
                                      0)))) %>%
  mutate(realization = ifelse(realization == "[aɪ]", "Retained /aɪ/", "Reduced /aɪ/")) %>%
  mutate(order1 = ifelse(realization == "Retained /aɪ/", 1, 2)) %>%
  mutate(freq = ifelse(freq == "H", "High", "Low")) %>%
  mutate(giv = ifelse(giv == "foc", "First mention", "Second mention")) %>%
  mutate(ag = paste0(area, age, gender)) %>%
  group_by(realization, stress, order2, Filename, freq, giv, order1) %>%
  summarise(m = 1000*mean(dur)) %>%
  ungroup() %>%
  group_by(realization, stress, order2, freq, giv, order1) %>%
  summarise(n = n(), mean = mean(m), se = sd(m)/sqrt(n), 
            max = mean+se, min = mean-se) %>%
  ungroup() %>%
  #filter(realization != "reduced") %>%
  distinct() %>%
  mutate(realization = as.factor(realization)) %>%
  arrange(realization) %>%
  mutate(realization = fct_reorder(realization, order1)) %>%
  mutate(stress = as.factor(stress)) %>%
  arrange(stress) %>%
  mutate(stress = fct_reorder(stress, order2))
## `summarise()` has grouped output by 'realization', 'stress', 'order2',
## 'Filename', 'freq', 'giv'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'realization', 'stress', 'order2', 'freq',
## 'giv'. You can override using the `.groups` argument.
f1_freq <- ggplot(sr2, aes(x= freq, y = mean, fill = stress)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .7, 
           #fill = "gray", 
           color = "black") +
  labs(x = "Word frequency", y = "Duration (ms)", fill = "Stress") +
      # , subtitle = "n ≥ 10") +
  geom_errorbar(aes(ymin = mean, ymax = max), 
                position = position_dodge(width = .7, preserve = "single"), 
                width = .4) +
  theme_bw() +
  facet_grid(realization~giv) +
  scale_y_continuous(expand = c(0,0), limit = c(0, 200)) +
  theme(plot.title = element_text(size = 16, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 16),
        strip.text.x = element_text(size = 16),
        strip.text.y = element_text(size = 16),
        legend.title = element_text(size = 16),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 16),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) +
      scale_fill_manual(
        values = c("#e63946", "#329cff", "#ffbf46"),
        limits = c("1", "2", "3"),
        breaks = c("1", "2", "3"),
        labels = c("S1", "S2", "S3")
  )

f1_freq

#ggsave("f2freq.png", dpi = 600, width = 10, height = 7.5)

sociolinguisitc factors

sr5 <- data3 %>%
  filter(POS == "content") %>%
  group_by(gender, age, area, stress, realization, Filename) %>%
  mutate(tokens = n()) %>%
  ungroup() %>%
  mutate(order1 = ifelse(realization == "[aɪ]", 6,
                        ifelse(realization == "[eɪ]", 5,
                              ifelse(realization == "[a]", 3,
                                    ifelse(realization == "[e]", 4,
                                        ifelse(realization == "[É™]", 2,
                                             1)))))) %>%
  mutate(order2 = ifelse(stress == "1", 1,
                        ifelse(stress == "2", 2,
                               ifelse(stress == "3", 3,
                                      0)))) %>%
  mutate(realization = ifelse(realization == "[aɪ]", "[aɪ]", "reduced")) %>%
  mutate(ag = paste0(area, age, gender)) %>%
  group_by(realization, stress, order2, Filename, age, area) %>%
  summarise(m = 1000*mean(dur)) %>%
  ungroup() %>%
  group_by(realization, stress, order2, age, area
           #ag
           ) %>%
  summarise(n = n(), mean = mean(m), se = sd(m)/sqrt(n), 
            max = mean+se, min = mean-se) %>%
  ungroup() %>%
  distinct() %>%
  mutate(stress = as.factor(stress)) %>%
  arrange(stress) %>%
  mutate(stress = fct_reorder(stress, order2))
## `summarise()` has grouped output by 'realization', 'stress', 'order2',
## 'Filename', 'age'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'realization', 'stress', 'order2', 'age'.
## You can override using the `.groups` argument.
f1_age <- ggplot(sr5, aes(x= age, y = mean, fill = stress)) +
  geom_bar(stat = "identity", 
           position = position_dodge(preserve = "single"), 
           width = .7, 
           #fill = "gray", 
           color = "black") +
  labs(x = "Age", y = "Duration (ms)", fill = "Stress") +
      # , subtitle = "n ≥ 10") +
  geom_errorbar(aes(ymin = mean, ymax = max), 
                position = position_dodge(width = .7, preserve = "single"), 
                width = .4) +
  theme_bw() +
  facet_grid(area~realization) +
  scale_y_continuous(expand = c(0,0), limit = c(0, 200)) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 18),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        axis.text.y = element_text(size = 16)) +
      scale_fill_manual(
        values = c("#e63946", "#329cff", "#ffbf46"),
        limits = c("1", "2", "3"),
        breaks = c("1", "2", "3"),
        labels = c("S1", "S2", "S3")
  )

f1_age

#ggsave("f2_c.png", dpi = 600, width = 10, height = 5)

#statistics

ddd <- data3 %>%
  #filter(dm != "D") %>%
  #filter(giv != "soc") %>%
  #filter(area == "S") %>%
  #filter(gender == "F") %>%
  #filter(age == "Y") %>%
  #filter(POS !="function") %>%
  #filter(freq != "H") %>%
  #filter(stress == "1") %>%
  filter(realization == "[aɪ]") %>%
  mutate(realization = ifelse(realization == "[aɪ]", "0", "1")) %>%
  #mutate(gender = ifelse(gender == "M", "1", "0")) %>%
  #mutate(age = ifelse(age == "Y", "1", "0")) %>%
  #mutate(stress = ifelse(stress == "3", "1", "0")) %>%
  #mutate(area = ifelse(area == "S", "1", "0")) %>%
  #filter(realization == "0") %>%
  mutate(pos = ifelse(pos == "tail", "1", "0")) %>%
  mutate(onset1 = ifelse(onset1 == "cor", "1", "0")) %>%
  mutate(context = ifelse(context == "else", "1", "0")) %>%
  mutate(freq = as.factor(freq)) %>%
  mutate(pos = as.factor(pos)) %>%
  mutate(POS = as.factor(POS)) %>%
  mutate(context = as.factor(context)) %>%
  mutate(realization = as.factor(realization)) %>%
  mutate(Filename = as.factor(Filename)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(gender = as.factor(gender)) %>%
  mutate(stress = as.factor(stress))

#model_d <- lmer(dur~realization*giv*freq*stress + (1|Filename), data = ddd)
#model_d <- lmer(dur~realization*stress*POS+age*area*gender + (1|Filename), data = ddd)
model_d <- lmer(dur~stress*POS + (1|Filename), data = ddd)
#model_d <- lmer(dur~giv*stress+age*area*gender + (1|Filename), data = ddd)

coefs <- data.frame(coef(summary(model_d)))
coefs <- coefs %>%
  mutate(Estimate = round(Estimate, 4)) %>%
  mutate(Std..Error = round(Std..Error, 4)) %>%
  mutate(t.value = round(t.value, 4)) %>%
  #mutate(p.z = 2*(1-pnorm(abs(coefs$t.value))))
  mutate(p.z = ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.001, "<.001",
                      ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.01, "<.01",
                             ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.05, "<.05",
                                    round(2*(1-pnorm(abs(coefs$t.value))), 4)))))
coefs
##                     Estimate Std..Error t.value    p.z
## (Intercept)           0.1359     0.0019 69.7309  <.001
## stress1              -0.0175     0.0026 -6.7569  <.001
## stress3               0.0258     0.0025 10.3718  <.001
## POSfunction           0.0030     0.0018  1.7209 0.0853
## stress1:POSfunction  -0.0150     0.0053 -2.8315   <.01
## stress3:POSfunction   0.0174     0.0075  2.3115   <.05

#formant and stress [ai]

f_r <- data3 %>%
  select(age, gender, stress, area, Filename, dur, realization, freq, onset, onset1, tone, giv, order,
          context,  wl, pos, con, POS, p0_1, p0_2, p1_1, p1_2, p2_1, p2_2, p3_1, p3_2, 
         p4_1, p4_2, p5_1, p5_2, p6_1, p6_2, p7_1, p7_2, p8_1, p8_2, p9_1, p9_2) %>%
  mutate(order1 = ifelse(stress == "3", 4,
                        ifelse(stress == "2", 3,
                               ifelse(stress == "1", 2,
                                             1))))

fr_1 <- f_r %>%
  gather("time", "formant", -age, -gender, -tone, -giv, -order1,
         -stress, -area, -Filename, -dur, -realization, -order, -freq, -onset, -onset1,
          -context,  -wl, -pos, -con, -POS) %>%
  na.omit()

fr_1$t <- str_replace(fr_1$time, "_(1|2)", "")
fr_1$t <- str_replace(fr_1$t, "p", "")
fr_1$f <- str_replace(fr_1$time, "p(0|1|2|3|4|5|6|7|8|9)_", "")

fr_2 <- fr_1 %>%
  select(-time) %>%
  mutate(ag = paste0(area, age, gender)) %>%
  mutate(stress = ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(formant = as.numeric(formant)) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(area = ifelse(area == "N", "North", "South")) %>%
  mutate(gender = ifelse(gender == "F", "Female", "Male")) %>%
  filter(realization == "[aɪ]") %>%
  mutate(stress = as.factor(stress)) %>%
  arrange(stress) %>%
  mutate(stress = fct_reorder(stress, order1)) %>%
  mutate(f = ifelse(f == 1, "F1", "F2")) %>%
  spread(f, formant) %>%
  filter(realization == "[aɪ]") %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  filter(t != 9) %>%
  filter(t != 0) 

stress plot

ai_s <- fr_2 %>%
  filter(freq != "M") %>%
  filter(realization == "[aɪ]") %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  filter(t != 9) %>%
  filter(t != 0) %>%
  group_by(Filename, age, gender, area, ag) %>%
  mutate(F1 = scale(F1), F2 = scale(F2)) %>%
  ungroup()

## Lobanov
ggplot(ai_s, aes(x = F2, y = F1, color = stress)) + 
  geom_smooth(aes(linetype = stress, fill = stress)) +
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  facet_grid(age~area)+
  theme_bw() +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("S1", "S2", "S3"),
                     values=c("dashed", "solid", "longdash")) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

f2_f1 <- ggplot() +
  geom_smooth(data = ai_s, aes(x=t, y=F1, linetype = stress, 
                                group = stress, color = stress, fill = stress),
              size = 2, se = T) +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("S1", "S2", "S3"),
                     values=c("dashed", "solid", "longdash")) +
  labs(x = "Normalized time", y = "Frequency (z-score)", linetype = "Stress", color = "Stress",
       #title = "Female",
       fill = "Stress") +
  theme_bw() +
  #facet_grid(~gender) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
f2_f2 <- ggplot() +
  geom_smooth(data = ai_s, aes(x=t, y=F2, linetype = stress, group = stress, 
                               color = stress, fill = stress),
              size = 2, se = T) +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("S1", "S2", "S3"),
                     values=c("dashed", "solid", "longdash")) +
  labs(x = "Normalized time", y = "Frequency (z-score)", linetype = "Stress", color = "Stress",
       #title = "Female",
       fill = "Stress") +
  #facet_grid(~gender)+
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

#ggsave("f3_z.png", dpi = 600, width = 10, height = 6)
f2_f1 <- ggplot() +
  geom_smooth(data = ai_s, aes(x=t, y=F1, linetype = stress, 
                                group = stress, color = stress, fill = stress),
              size = 2, se = T) +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("S1", "S2", "S3"),
                     values=c("dashed", "solid", "longdash")) +
  labs(x = "Normalized time", y = "Frequency (z-score)", linetype = "Stress", color = "Stress",
       fill = "Stress") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

f2_f2 <- ggplot() +
  geom_smooth(data = ai_s, aes(x=t, y=F2, linetype = stress, group = stress, 
                               color = stress, fill = stress),
              size = 2, se = T) +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("S1", "S2", "S3"),
                     values=c("dashed", "solid", "longdash")) +
  labs(x = "Normalized time", y = "Frequency (z-score)", linetype = "Stress", color = "Stress",
       fill = "Stress") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

The example

a_f <- ai_s %>%
  filter(t %in% c(1,2,3)) %>%
  group_by(Filename, order) %>%
  mutate(time = 1) %>%
  mutate(F1_max = max(F1)) %>%
  mutate(F2_min = min(F2)) %>%
  #select(-t) %>%
  distinct() %>%
  ungroup()

i_f <- ai_s %>%
  filter(t %in% c(6,7,8)) %>%
  group_by(Filename, order) %>%
  mutate(time = 2) %>%
  mutate(F1_min = min(F1)) %>%
  mutate(F2_max = max(F2)) %>%
  #select(-t) %>%
  distinct() %>%
  ungroup()

ai_f1 <- bind_rows(a_f, i_f)

ex <- ai_f1 %>%
  mutate(t = as.numeric(t)) %>%
  filter(Filename == "XMC") %>%
  filter(order == 42)

ex1 <- fr_2 %>%
  #filter(stress == "S3") %>%
  #filter(freq == "L") %>%
  mutate(t = as.numeric(t)) %>%
  filter(Filename == "XMC") %>%
  filter(order == 42)

highlighted_points <- data.frame(x = c(3, 8), y = c(809.9949, 538.4187))
hp <- data.frame(x = c(1, 7), y = c(1737.924, 2104.313))

f1 <- ggplot() +
  geom_point(data = ex1, aes(x = t, y = F1), color = "blue", size = 4) +
  geom_line(data = ex1, aes(x = t, y = F1), size = 1, color = "blue") +
  geom_rect(data = highlighted_points, 
            aes(xmin = x - 0.3, xmax = x + 0.3, ymin = y - 80, ymax = y + 80),
            color = "dark red", fill = NA, size = 1) +
  theme_bw() +
  scale_x_continuous(breaks = ex1$t) +
  labs(x = "Time",  y = "Formant frequencies (Hz)") +
  geom_point(data = ex1, aes(x = t, y = F2), color = "blue", size = 4) +
  geom_line(data = ex1, aes(x = t, y = F2), size = 1, color = "blue") +
  geom_rect(data = hp, 
            aes(xmin = x - 0.3, xmax = x + 0.3, ymin = y - 80, ymax = y + 80),
            color = "dark red", fill = NA, size = 1) +
  theme(plot.title = element_text(size = 16, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 14),
        strip.text.x = element_text(size = 16),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 16),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 12)) 

f1

#ggsave("ex1.png", dpi = 600, width = 8, height = 6)

stress and frequency plot

ai_s <- fr_2 %>%
  #filter(giv != "foc") %>%
  filter(POS == "content") %>%
  mutate(ra = paste(area, age))

a_s <- ai_s %>%
  filter(t %in% c(1,2,3)) %>%
  group_by(Filename, order) %>%
  mutate(time = 1) %>%
  mutate(F1 = max(F1)) %>%
  mutate(F2 = min(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

i_s <- ai_s %>%
  filter(t %in% c(6,7,8)) %>%
  group_by(Filename, order) %>%
  mutate(time = 2) %>%
  mutate(F1 = min(F1)) %>%
  mutate(F2 = max(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

ai_s1 <- bind_rows(a_s, i_s)

aim_s <- ai_s1 %>%
  group_by(time, 
           freq,
           giv,
           stress) %>%
  reframe(F1 = mean(F1), F2 = mean(F2)) %>%
  mutate(giv = ifelse(giv == "foc", "First-mention", "Second-mention")) %>%
  rename(Frequency = freq) %>%
  ungroup() 

## Lobanov
ggplot(aim_s, aes(x = F2, y = F1, color = stress)) + 
  geom_point(aes(shape = Frequency), size = 4) + 
  geom_line(aes(linetype = Frequency), linewidth = .8) +
  #geom_smooth(aes(linetype = stress, fill = stress)) +
  theme_minimal() +
  labs(x = "Normalized F2", y = "Normalized F1") +
  scale_x_reverse() + 
  scale_y_reverse() + 
  facet_grid(~giv)+
  theme_bw() +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot")

#ggsave("f3_freq.png", dpi = 600, width = 8, height = 5)

category plot

ai_c <- fr_2 %>%
  #filter(giv != "foc") %>%
  #filter(stress == "S2") %>%
  mutate(ra = paste(area, age))

a_c <- ai_c %>%
  filter(t %in% c(1,2,3)) %>%
  group_by(Filename, order) %>%
  mutate(time = 1) %>%
  mutate(F1 = max(F1)) %>%
  mutate(F2 = min(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

i_c <- ai_c %>%
  filter(t %in% c(6,7,8)) %>%
  group_by(Filename, order) %>%
  mutate(time = 2) %>%
  mutate(F1 = min(F1)) %>%
  mutate(F2 = max(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

ai_c1 <- bind_rows(a_c, i_c)

aim_c <- ai_c1 %>%
  group_by(time, stress, 
           POS) %>%
  reframe(F1 = mean(F1), F2 = mean(F2)) %>%
  ungroup()

## Lobanov
ggplot(aim_c, aes(x = F2, y = F1, color = stress)) + 
  geom_point(aes(shape = POS), size = 4) +
  geom_line(aes(linetype = POS), linewidth = .8) +
  #geom_smooth(aes(linetype = POS, fill = POS)) +
  theme_minimal() +
  labs(x = "Normalized F2", y = "Normalized F1") +
  scale_x_reverse() + 
  scale_y_reverse() + 
  theme_bw() +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("content", "function"),
                     values=c("solid", "longdash")) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot")

#ggsave("f3_c.png", dpi = 600, width = 8, height = 5)
f2_f1 <- ggplot() +
  geom_smooth(data = ai_c, aes(x=t, y=F1, linetype = POS, group = POS, color = POS, fill = POS),
              size = 2, se = T) +
  scale_color_manual(labels = c("content", "function"),
                    values = c("#329cff", "#e63946")) +
  scale_fill_manual(labels = c("content", "function"),
                    values = c("#329cff", "#e63946")) +
  scale_linetype_manual(labels = c("content", "function"),
                     values=c("solid", "longdash")) +
  labs(x = "Normalized time", y = "Frequency (z-score)", linetype = "Word category", 
       color = "Word category", 
       fill = "Word category") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

f2_f2 <- ggplot() +
  geom_smooth(data = ai_c, aes(x=t, y=F2, linetype = POS, group = POS, color = POS, fill = POS),
              size = 2, se = T) +
  scale_color_manual(labels = c("content", "function"),
                    values = c("#329cff", "#e63946")) +
  scale_fill_manual(labels = c("content", "function"),
                    values = c("#329cff", "#e63946")) +
  scale_linetype_manual(labels = c("content", "function"),
                     values=c("solid", "longdash")) +
  labs(x = "", y = "Frequency (z-score)", linetype = "Word category", 
       color = "Word category", 
       fill = "Word category") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

frequency plot

ai_f <- fr_2 %>%
  filter(giv != "foc") %>%
  filter(POS == "content") %>%
  filter(stress == "S2") %>%
  mutate(freq = ifelse(freq == "H", "High", "Low")) %>%
  mutate(ra = paste(area, age))
 
a_f <- ai_f %>%
  filter(t %in% c(1,2,3)) %>%
  group_by(Filename, order) %>%
  mutate(time = 1) %>%
  mutate(F1 = max(F1)) %>%
  mutate(F2 = min(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

i_f <- ai_f %>%
  filter(t %in% c(6,7,8)) %>%
  group_by(Filename, order) %>%
  mutate(time = 2) %>%
  mutate(F1 = min(F1)) %>%
  mutate(F2 = max(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

ai_f1 <- bind_rows(a_f, i_f)

aim_f <- ai_f1 %>%
  group_by(time, freq) %>%
  reframe(F1 = mean(F1), F2 = mean(F2)) %>%
  ungroup()

## Lobanov
ggplot(aim_f, aes(x = F2, y = F1, color = freq)) + 
  geom_point(aes(shape = freq)) + 
  geom_line(aes(linetype = freq)) +
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  theme_bw() +
  scale_color_manual(labels = c("High", "Low"),
                    values = c("green","#329cff")) +
  scale_fill_manual(labels = c("High", "Low"),
                    values = c("green","#329cff")) +
  scale_linetype_manual(labels = c("High", "Low"),
                     values=c("longdash", "solid")) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot") 

#ggsave("f3_z.png", dpi = 600, width = 10, height = 6)

STATS for Standardized Formants

ai_st <- ai_c1 %>%
  filter(giv == "foc") %>%
  #filter(stress == "S1") %>%
  mutate(stress = ifelse(stress == "S1", 2, ifelse(stress == "S2", 1, 3))) %>%
  mutate(stress = as.factor(stress)) %>%
  mutate(area = as.factor(area)) %>%
  #filter(gender == "Female") %>%
  #filter(age != "Old") %>%
  #filter(area == "North") %>%
  filter(POS != "function") %>%
  filter(freq != "H") %>%
  filter(time == 1) %>%
  mutate(freq = as.factor(freq)) %>%
  mutate(POS = as.factor(POS)) %>%
  #mutate(time = as.numeric(time)) %>%
  filter(realization == "[aɪ]") 

#model_f <- lmer(F2~freq*t + (1 | Filename), data = ai_f1_t)
model_f <- lmer(F1~stress + age*gender*area + (1 | Filename), data = ai_st)
#model_f <- lmer(F1~POS*stress + age*gender*area + (1 | Filename), data = ai_st)
#model_f <- lmer(F1~area*gender*age  + (1 | Filename), data = ai_st)
#model_f <- lmer(F1~stress + age*gender*area + (1 | Filename), data = ai_st)
#model_f <- lmer(F2~area*t  + (1 | Filename), data = ai_st)
#model_f <- lmer(formant~stress*t + (1 | Filename), data = ai_f1_t)

coefs <- data.frame(coef(summary(model_f)))
coefs <- coefs %>%
  mutate(Estimate = round(Estimate, 4)) %>%
  mutate(Std..Error = round(Std..Error, 4)) %>%
  mutate(t.value = round(t.value, 4)) %>%
  mutate(p.z = round(2*(1-pnorm(abs(coefs$t.value))), 4)) %>%
  mutate(p.z = ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.001, "<.001",
                      ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.01, "<.01",
                             ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.05, "<.05",
                                    round(2*(1-pnorm(abs(coefs$t.value))), 4)))))
coefs
##                                Estimate Std..Error t.value    p.z
## (Intercept)                    909.2039    31.8001 28.5912  <.001
## stress2                         -9.6881    10.3224 -0.9385  0.348
## stress3                         49.6610     7.5243  6.6001  <.001
## ageYoung                       -60.2213    44.8608 -1.3424 0.1795
## genderMale                    -233.2704    44.9772 -5.1864  <.001
## areaSouth                      -40.1602    48.7771 -0.8233 0.4103
## ageYoung:genderMale            137.6632    63.5721  2.1655   <.05
## ageYoung:areaSouth             130.5093    66.3142  1.9680   <.05
## genderMale:areaSouth            93.7502    66.3456  1.4131 0.1576
## ageYoung:genderMale:areaSouth -213.4432    91.9026 -2.3225   <.05
#summary(model_f)
ai_age <- fr_2 %>%
  filter(t != 9) %>%
  filter(t != 0) %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  group_by(Filename, age, area, ag, t) %>%
  mutate(F1 = scale(F1), F2 = scale(F2)) %>%
  ungroup() %>%
  #filter(giv == "foc") %>%
  #filter(stress == "S2") %>%
  filter(POS == "content") %>%
  #filter(context == "else") %>%
  #filter(onset1 == "dor") %>%
  filter(realization == "[aɪ]") %>%
  mutate(ra = paste(area, age))

a_age <- ai_age %>%
  filter(t %in% c(1,2,3)) %>%
  group_by(Filename, order) %>%
  mutate(time = 1) %>%
  mutate(F1 = max(F1)) %>%
  mutate(F2 = min(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup() #%>%
  # group_by(Filename) %>%
  # mutate(num = seq(1,length(Filename))) %>%
  # ungroup() %>%
  # filter(num <= 20)

i_age <- ai_age %>%
  filter(t %in% c(6,7,8)) %>%
  group_by(Filename, order) %>%
  mutate(time = 2) %>%
  mutate(F1 = min(F1)) %>%
  mutate(F2 = max(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup() #%>%
  # group_by(Filename) %>%
  # mutate(num = seq(1,length(Filename))) %>%
  # ungroup() %>%
  # filter(num <= 20)

ai_age1 <- bind_rows(a_age, i_age)

aim_age <- ai_age1 %>%
  group_by(age, area, time) %>%
  reframe(F1 = mean(F1), F2 = mean(F2)) %>%
  ungroup() 

ggplot(aim_age, aes(x = F2, y = F1, 
                    color = area
                    )) + 
  geom_point(aes(shape = area), size = 4) + 
  geom_line(aes(linetype = area) , linewidth = .8) +
  #geom_smooth(aes(linetype = age, fill = age)) +
  theme_minimal() +
  labs(x = "Normalized F2", y = "Normalized F1") +
  scale_x_reverse() + 
  scale_y_reverse() + 
  scale_color_manual(labels = c("North", "South"),
                    values = c("#329cff","#ff006e")) +
  scale_fill_manual(labels = c("North", "South"),
                    values = c("#329cff","#ff006e")) +
  scale_linetype_manual(labels = c("North", "South"),
                     values=c("solid", "longdash")) +
  theme_bw() +
  facet_grid(~age) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot")

#ggsave("f3_age.png", dpi = 600, width = 8, height = 4)
ai_area <- fr_2 %>%
  filter(freq != "M") %>%
  filter(stress == "S3") %>%
  filter(POS == "content") %>%
  #filter(freq == "L") %>%
  mutate(freq = ifelse(freq == "H", "High", "Low")) %>%
  filter(realization == "[aɪ]") %>%
  mutate(order3 = ifelse(area == "South", 1, 2)) %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  filter(t != 9) %>%
  filter(t != 0) %>%
  group_by(Filename, age, gender, area, ag) %>%
  mutate(F1 = scale(F1), F2 = scale(F2)) %>%
  mutate(region = area) %>%
  mutate(freq = as.factor(area)) %>%
  arrange(area) %>%
  mutate(area = fct_reorder(area, order3)) %>%
  filter(F1 <= 6) %>%
  ungroup() 
## Warning: Using one column matrices in `filter()` was deprecated in dplyr 1.1.0.
## ℹ Please use one dimensional logical vectors instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(ai_area, aes(x = F2, y = F1, color = area)) + 
  #geom_point(aes(shape = stress),alpha = 0.5) + 
  geom_smooth(aes(linetype = area, fill = area)) +
  #stat_ellipse() + 
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  scale_color_manual(labels = c("North", "South"),
                    values = c("#329cff", "#fb5607")) +
  scale_fill_manual(labels = c("North", "South"),
                    values = c("#329cff", "#fb5607")) +
  scale_linetype_manual(labels = c("North", "South"),
                     values=c("solid","longdash")) +
  facet_grid(gender~age) +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

f2_f1 <- ggplot() +
  geom_smooth(data = ai_area, aes(x=t, y=F1, linetype = area, group = area, 
                               color = area, fill = area),
              size = 2, se = T) +
  scale_color_manual(labels = c("North", "South"),
                    values = c("#ff006e", "#329cff")) +
  scale_fill_manual(labels = c("North", "South"),
                    values = c("#ff006e", "#329cff")) +
  scale_linetype_manual(labels = c("North", "South"),
                     values=c("longdash","solid")) +
  labs(x = "", y = "Frequency (z-score)", linetype = "Region", 
       color = "Region", 
       fill = "Region") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

f2_f2 <- ggplot() +
  geom_smooth(data = ai_age, aes(x=t, y=F2, linetype = area, group = area, 
                               color = area, fill = area),
              size = 2, se = T) +
  scale_color_manual(labels = c("North", "South"),
                    values = c("#329cff", "#fb5607")) +
  scale_fill_manual(labels = c("North", "South"),
                    values = c("#329cff", "#fb5607")) +
  scale_linetype_manual(labels = c("North", "South"),
                     values=c("solid","longdash")) +
  labs(x = "", y = "Frequency (z-score)", linetype = "Region", 
       color = "Region", 
       #title = "Female", 
       fill = "Region") +
  #facet_grid(~gender)+
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16))

#ggsave("f3_area.png", dpi = 600, width = 8, height = 5)

#tonal realization (stress)

tr <- data3 %>%
  select(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, Filename, age, gender, area, freq, realization, stress, label, order, dur, onset, onset1, context, wl, pos, con, POS, n, tone) %>%
  mutate(order = ifelse(stress == "3", 4,
                        ifelse(stress == "2", 3,
                               ifelse(stress == "1", 2,
                                             1))))
tr1 <- tr %>%
  gather("time", "pitch", -age, -gender,
         -stress, -area, -Filename, -dur, -realization, -order, -freq, -onset, -onset1,
          -context,  -wl, -pos, -con, -POS, -label, -n, -tone)

tr1$t <- str_replace(tr1$time, "p", "")

tr2 <- tr1 %>%
  filter(stress != 0) %>%
  filter(tone != 0) %>%
  #filter(pitch != is.na(pitch)) %>%
  na.omit() %>%
  mutate(ag = paste0(area, age, gender)) %>%
  group_by(Filename, age, gender, area, ag) %>%
  mutate(mean = mean(pitch), z = scale(pitch)) %>%
  ungroup() %>%
  mutate(tone = paste0("T", tone)) %>%
  mutate(stress = ifelse(stress == 2, "S2", ifelse(stress == 3, "S3", "S1"))) %>%
  mutate(stress = as.factor(stress)) %>%
  arrange(stress) %>%
  mutate(stress = fct_reorder(stress, order))

f_pm <- ggplot() +
  geom_smooth(data = tr2,
              aes(x=t, y=z, color = stress, linetype = stress, group = stress,
                  fill = stress), linewidth = 1, se = T) +
  scale_color_manual(labels = c("S1", "S2", "S3"),
                     values=c("#e63946", "#329cff", "#ffbf46")) +
  scale_fill_manual(labels = c("S1", "S2", "S3"),
                    values = c("#e63946", "#329cff", "#ffbf46")) +
  scale_linetype_manual(labels = c("S1", "S2", "S3"),
                     values=c("dashed", "solid", "longdash")) +
  labs(x = "Normalized time", y = "Pitch (z-score)", linetype = "Stress", color = "Stress", fill = "Stress") +
  facet_grid(~tone)+
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 14),
        strip.text.x = element_text(size = 22),
        legend.title = element_text(size = 18),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 18),
        axis.text.y = element_text(size = 14))

f_pm
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#ggsave("f7.png", dpi = 600, width = 10, height = 5)

STATS for PITCH: GAMs

library(mgcv)
## Warning: package 'mgcv' was built under R version 4.3.1
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
## 
##     lmList
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(itsadug)
## Loading required package: plotfunctions
## 
## Attaching package: 'plotfunctions'
## The following object is masked from 'package:plotly':
## 
##     add_bars
## The following object is masked from 'package:RVAideMemoire':
## 
##     se
## The following object is masked from 'package:ggpubr':
## 
##     get_palette
## The following object is masked from 'package:ggplot2':
## 
##     alpha
## Loaded package itsadug 2.4 (see 'help("itsadug")' ).
pr1 <- tr2 %>%
  mutate(tone = as.factor(tone)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(Filename= as.factor(Filename)) %>%
  filter(tone == "T1") %>%
  mutate(stress = ifelse(stress == "S2", "R1", ifelse(stress == "S1", "R2", "R3"))) %>%
  mutate(stress = as.factor(stress)) %>%
  mutate(t = as.numeric(t)) 

pr2 <- tr2 %>%
  mutate(tone = as.factor(tone)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(Filename= as.factor(Filename)) %>%
  filter(tone == "T2") %>%
  mutate(stress = ifelse(stress == "S2", "R1", ifelse(stress == "S1", "R2", "R3"))) %>%
  mutate(stress = as.factor(stress)) %>%
  mutate(t = as.numeric(t)) 

pr3 <- tr2 %>%
  mutate(tone = as.factor(tone)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(Filename= as.factor(Filename)) %>%
  filter(tone == "T3") %>%
  mutate(stress = ifelse(stress == "S2", "R1", ifelse(stress == "S1", "R2", "R3"))) %>%
  mutate(stress = as.factor(stress)) %>%
  mutate(t = as.numeric(t)) 

pr4 <- tr2 %>%
  mutate(tone = as.factor(tone)) %>%
  mutate(area = as.factor(area)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(Filename= as.factor(Filename)) %>%
  filter(tone == "T4") %>%
  mutate(stress = ifelse(stress == "S2", "R1", ifelse(stress == "S1", "R2", "R3"))) %>%
  mutate(stress = as.factor(stress)) %>%
  mutate(t = as.numeric(t)) 

gam_model1 <- bam(z ~ stress + s(t, by = stress) + s(t, Filename, bs="fs", m=1), data = pr1)
gam_model2 <- bam(z ~ stress + s(t, by = stress) + s(t, Filename, bs="fs", m=1), data = pr2)
gam_model3 <- bam(z ~ stress + s(t, by = stress) + s(t, Filename, bs="fs", m=1), data = pr3)
gam_model4 <- bam(z ~ stress + s(t, by = stress) + s(t, Filename, bs="fs", m=1), data = pr4)

# Plot the GAM
#itsadug::plot_smooth(gam_model1, view="t", plot_all="stress", rm.ranef=TRUE)
dev.new(width = 8, height = 15, unit="in")
par(mar=c(2,2,2,2))
layout(matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 4,
              ncol = 2, byrow = TRUE), 
       widths=c(1, 1), heights=c(1, 1, 1, 1)) 
itsadug::plot_diff(gam_model1, comp = list(stress=c("R1","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone1: S2 - S1 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): GYX.
## 
## t window(s) of significant difference(s):
##  4.909091 - 9.000000
itsadug::plot_diff(gam_model1, comp = list(stress=c("R3","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone1: S3 - S2 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): GYX.
## 
## t window(s) of significant difference(s):
##  0.000000 - 9.000000
itsadug::plot_diff(gam_model2, comp = list(stress=c("R1","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone2: S2 - S1 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): LJS.
## 
## t window(s) of significant difference(s):
##  0.000000 - 5.909091
##  6.818182 - 9.000000
itsadug::plot_diff(gam_model2, comp = list(stress=c("R3","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone2: S3 - S2 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): LJS.
## 
## t window(s) of significant difference(s):
##  0.000000 - 2.090909
##  3.363636 - 9.000000
itsadug::plot_diff(gam_model3, comp = list(stress=c("R1","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone3: S2 - S1 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): CZX1.
## 
## t window(s) of significant difference(s):
##  1.454545 - 8.454545
itsadug::plot_diff(gam_model3, comp = list(stress=c("R3","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone3: S3 - S2 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): CZX1.
## 
## t window(s) of significant difference(s):
##  1.090909 - 8.636364
itsadug::plot_diff(gam_model4, comp = list(stress=c("R1","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone4: S2 - S1 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): XMC.
## 
## t window(s) of significant difference(s):
##  0.000000 - 3.454545
##  4.272727 - 9.000000
itsadug::plot_diff(gam_model4, comp = list(stress=c("R3","R2")), rm.ranef=F, view="t",
                   hide.label =  TRUE, main = "Tone4: S3 - S2 difference")
## Summary:
##  * t : numeric predictor; with 100 values ranging from 0.000000 to 9.000000. 
##  * Filename : factor; set to the value(s): XMC.
## 
## t window(s) of significant difference(s):
##  0.000000 - 3.636364
##  4.545455 - 9.000000
# Summary of the GAM
summary(gam_model1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## z ~ stress + s(t, by = stress) + s(t, Filename, bs = "fs", m = 1)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28402    0.04538   6.259 4.12e-10 ***
## stressR2    -0.15046    0.05423  -2.774  0.00555 ** 
## stressR3     0.90618    0.03515  25.781  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf  Ref.df      F p-value    
## s(t):stressR1  3.028   3.764 18.077 < 2e-16 ***
## s(t):stressR2  1.000   1.000 10.185 0.00142 ** 
## s(t):stressR3  1.630   2.023  3.000 0.05166 .  
## s(t,Filename) 27.223 278.000  1.176 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.147   Deviance explained = 15.2%
## fREML = 8775.5  Scale est. = 0.87923   n = 6439
summary(gam_model2)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## z ~ stress + s(t, by = stress) + s(t, Filename, bs = "fs", m = 1)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.39616    0.02010  -19.71   <2e-16 ***
## stressR2     0.12913    0.01110   11.64   <2e-16 ***
## stressR3     0.57419    0.02849   20.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf  Ref.df      F p-value    
## s(t):stressR1  5.081   6.119 23.139  <2e-16 ***
## s(t):stressR2  2.833   3.475 44.968  <2e-16 ***
## s(t):stressR3  3.609   4.471 38.713  <2e-16 ***
## s(t,Filename) 93.139 278.000  2.942  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0622   Deviance explained = 6.52%
## fREML =  40686  Scale est. = 0.64949   n = 33716
summary(gam_model3)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## z ~ stress + s(t, by = stress) + s(t, Filename, bs = "fs", m = 1)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.61686    0.11783  -5.235 1.79e-07 ***
## stressR2     0.31766    0.08078   3.933 8.64e-05 ***
## stressR3    -0.27278    0.04738  -5.757 9.64e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf  Ref.df      F p-value    
## s(t):stressR1  3.510   4.283 14.301  <2e-16 ***
## s(t):stressR2  1.000   1.000  1.777   0.183    
## s(t):stressR3  3.703   4.566 10.755  <2e-16 ***
## s(t,Filename) 67.381 272.000  2.022  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.228   Deviance explained = 25.1%
## fREML = 2908.9  Scale est. = 0.54169   n = 2525
summary(gam_model4)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## z ~ stress + s(t, by = stress) + s(t, Filename, bs = "fs", m = 1)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28118    0.01755   16.02   <2e-16 ***
## stressR2     0.03675    0.01612    2.28   0.0226 *  
## stressR3     0.01536    0.02647    0.58   0.5617    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf  Ref.df       F p-value    
## s(t):stressR1  2.855   3.486 372.268  <2e-16 ***
## s(t):stressR2  1.000   1.000   1.508   0.219    
## s(t):stressR3  2.967   3.686 141.601  <2e-16 ***
## s(t,Filename) 69.773 278.000   1.619  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0978   Deviance explained = 9.96%
## fREML =  53744  Scale est. = 0.91783   n = 38993

================================================================================= #citation forms

ai_GY <- read_csv("data - read.csv")
## Rows: 1638 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Filename, Segment_label
## dbl (21): dur, p0_1, p0_2, p1_1, p1_2, p2_1, p2_2, p3_1, p3_2, p4_1, p4_2, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#ai_GY <- read_csv("read330.csv")
#ai_GY <- read_csv("data - read2.csv")
ai_GY1 <- ai_GY %>%
  #na.omit() %>%
  gather("time", "formant", -Filename, -dur, -Segment_label)

##data pre-processing
ai_GY1$t <- str_replace(ai_GY1$time, "_(1|2)", "")
ai_GY1$t <- str_replace(ai_GY1$t, "p", "")
ai_GY1$f <- str_replace(ai_GY1$time, "p(0|1|2|3|4|5|6|7|8|9)_", "")
ai_GY1$lan <- str_replace(ai_GY1$Filename, "^[A-Z]+_[A-Z]+_[a-z]+_", "")
ai_GY1$Filename <- str_replace(ai_GY1$Filename, "_GY", "")
ai_GY1$Filename <- str_replace(ai_GY1$Filename, "_TW", "")
ai_GY1$group <- str_replace(ai_GY1$Filename, "_[A-Z]+_[a-z]+", "")
ai_GY1$Filename <- str_replace(ai_GY1$Filename, "^[SNYOFM]{3}_", "")
ai_GY1$word <- str_replace(ai_GY1$Filename, "[A-Z]+_", "")
ai_GY1$Filename <- str_replace(ai_GY1$Filename, "_[a-z]+", "")
ai_GY1$gender <- str_replace(ai_GY1$group, "[SNYO]{2}", "")
ai_GY1$area <- str_replace(ai_GY1$group, "[YOFM]{2}", "")
ai_GY1$age <- str_replace(ai_GY1$group, "[SN]", "")
ai_GY1$age <- str_replace(ai_GY1$age, "[FM]", "")

ai_GY2 <- ai_GY1 %>%
  filter(Filename != "ZMH") %>%
  mutate(vowel = "ai")

##contextual influence
ai_word <- read_csv("read_word.csv")
## Rows: 40 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): word, con
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ai_read <- right_join(ai_GY2, ai_word, by = "word")
fluency <- read_csv("fluency.csv")
## Rows: 40 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Filename
## dbl (2): TW, CN
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ai_read2 <- right_join(ai_read, fluency, by = "Filename")
ai_read1 <- ai_read2 %>%
  filter(Segment_label == "ai") %>%
  mutate(proficiency = TW/CN) %>%
  mutate(context = ifelse(con %in% c("zh", "ch", "sh", "r", "z", "c", "s", "i", "y", "d", "t", "n", "l", "j", "q", "x"), "cor",
                           ifelse(con %in% c("b", "p", "m", "f", "w"), "lab",
                                  ifelse(con %in% c("g", "k", "h", "o", "u", "e", "a"), "dor", "else"))))

##language plot

ai_read_f1 <- ai_read1 %>%
  mutate(ag = paste0(area, age, gender)) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(area = ifelse(area == "N", "North", "South")) %>%
  mutate(gender = ifelse(gender == "F", "Female", "Male")) %>%
  mutate(ra = paste(area, age)) %>%
  mutate(lan = ifelse(lan == "TW", "Min", "Mandarin")) %>%
  select(-time) %>%
  mutate(f = ifelse(f == 1, "F1", "F2")) %>%
  filter(formant != is.na(formant)) %>%
  spread(f, formant) %>%
  filter(t != 0) %>%
  #filter(t != 9) %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  group_by(Filename, age, gender, area, ag, t) %>%
  mutate(F1 = scale(F1), F2 = scale(F2)) %>%
  rename(Region = area) %>%
  rename(Language = lan) %>%
  ungroup() 

a <- ai_read_f1 %>%
  filter(t %in% c(1,2,3)) %>%
  group_by(Filename, word) %>%
  mutate(time = 1) %>%
  mutate(F1 = max(F1)) %>%
  mutate(F2 = min(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

i <- ai_read_f1 %>%
  filter(t %in% c(6,7,8,9)) %>%
  group_by(Filename, word) %>%
  mutate(time = 2) %>%
  mutate(F1 = min(F1)) %>%
  mutate(F2 = max(F2)) %>%
  select(-t) %>%
  distinct() %>%
  ungroup()

ai1 <- bind_rows(a, i)

aim <- ai1 %>%
  group_by(gender, age, Region, Language, time, ra, ag) %>%
  reframe(F1 = mean(F1), F2 = mean(F2)) %>%
  ungroup() #%>%
  #filter(gender == "Female")

ggplot(aim, aes(x = F2, y = F1, color = Language, fill = Language, shape = ra)) + 
  geom_point(aes(shape = ra, fill = Language), size = 4) + 
  geom_line(aes(linetype = Language)) +
  #geom_smooth(aes(linetype = lan, fill = lan)) +
  scale_shape_manual(labels = c("North Old", "North Young", "South Old", "South Young"),
                     values = c(1, 16, 2, 17)) +  # Specify shapes (filled and open)
  scale_linetype_manual(labels = c("Mandarin", "Min"),
                        values = c("dashed", "solid")) +
  # scale_fill_manual(labels = c("Mandarin", "Min"),
  #                   values = c("blue", NA)) +
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  facet_grid(~gender) +
  labs(x = "Normalized F2", y = "Normalized F1") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        strip.text.y = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  theme(legend.title=element_blank()) +
  ggtitle("Lobanov-transformed vowel plot")

#ggsave("f2_lan.png", dpi = 600, width = 8, height = 5)

age plot

ai_read_age <- ai_read1 %>%
  mutate(ag = paste0(area, age, gender)) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(area = ifelse(area == "N", "North", "South")) %>%
  mutate(gender = ifelse(gender == "F", "Female", "Male")) %>%
  mutate(lan = ifelse(lan == "TW", "Min", "Mandarin")) %>%
  mutate(group = paste(area, gender)) %>%
  select(-time) %>%
  mutate(f = ifelse(f == 1, "F1", "F2")) %>%
  filter(formant != is.na(formant)) %>%
  spread(f, formant) %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  filter(t != 0) %>%
  filter(t != 9) %>%
  group_by(Filename, age, gender, area, ag) %>%
  mutate(F1 = scale(F1), F2 = scale(F2)) %>%
  ungroup() #%>%
  #filter(lan != "Min") %>%
  #filter(gender != "Female")
  # group_by(age, gender, area, ag, t) %>%
  # summarise(mean_F1 = mean(F1), mean_F2 = mean(F2))

age1 <- ggplot(ai_read_age, aes(x = F2, y = F1, color = age)) + 
  #geom_point(aes(shape = age),alpha = 0.5) + 
  geom_smooth(aes(linetype = age, fill = age)) +
  #stat_ellipse() + 
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  #facet_grid(~gender)+
  scale_color_manual(labels = c("Old", "Young"),
                    values = c("#02c39a", "#FF3C38")) +
  scale_fill_manual(labels = c("Old", "Young"),
                    values = c("#02c39a", "#FF3C38")) +
  theme_bw() +
  labs(linetype = "Age", color = "Age", fill = "Age"
       ) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        strip.text.y = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) #+ 
  #ggtitle("Lobanov-transformed vowel plot")

#ggsave("f2_age.png", dpi = 600, width = 8, height = 5)
age1
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

proficiency

aip <- ai1 %>%
  #mutate(pro = ifelse(proficiency >= mean(proficiency), "High", "Low")) %>%
  #mutate(pro = ifelse(proficiency >= 0.6, "High", "Low")) %>%
  mutate(pro = ifelse(TW >= 5, "High", "Low")) %>%
  group_by(gender, age, Region, Language, time, ra, ag, pro) %>%
  reframe(F1 = mean(F1), F2 = mean(F2)) %>%
  ungroup() #%>%
  #filter(gender == "Female")

ggplot(aip, aes(x = F2, y = F1, color = Language, fill = Language, shape = ra)) + 
  geom_point(aes(shape = ra, fill = Language), size = 4) + 
  geom_line(aes(linetype = Language)) +
  #geom_smooth(aes(linetype = lan, fill = lan)) +
  scale_shape_manual(labels = c("North Old", "North Young", "South Old", "South Young"),
                     values = c(1, 16, 2, 17)) +  # Specify shapes (filled and open)
  scale_linetype_manual(labels = c("Mandarin", "Min"),
                        values = c("dashed", "solid")) +
  # scale_fill_manual(labels = c("Mandarin", "Min"),
  #                   values = c("blue", NA)) +
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  facet_grid(~pro + gender) +
  labs(x = "Normalized F2", y = "Normalized F1") +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        strip.text.y = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  theme(legend.title=element_blank()) +
  ggtitle("Lobanov-transformed vowel plot")

#ggsave("f2_lan.png", dpi = 600, width = 8, height = 5)

###GY and TW statistics F1

aif1 <- ai1 %>%
  #ai_read1 %>%
  #mutate(ag = paste0(area, age, gender)) %>%
  # mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  # mutate(area = ifelse(area == "N", "North", "South")) %>%
  # mutate(gender = ifelse(gender == "F", "Female", "Male")) %>%
  # mutate(lan = ifelse(lan == "TW", "Min", "Mandarin")) %>%
  # # mutate(group = as.factor(group)) %>%
  #filter(gender == "Female") %>%
  #filter(age == "Old") %>%
  #filter(Region == "North") %>%
  filter(Language != "Min") %>%
  filter(time == 2) %>%
  mutate(pro = ifelse(TW >= 5, "High", "Low")) %>%
  mutate(Language = as.factor(Language)) %>%
  mutate(Filename = as.factor(Filename)) %>%
  mutate(context = as.factor(context)) %>%
  mutate(age = as.factor(age)) %>%
  mutate(Region = as.factor(Region)) %>%
  mutate(gender = as.factor(gender)) 

#model_f <- lmer(F2~area*t + (1 | Filename), data = aif1) 
model_f <- lmer(F2~pro*gender*age*Region + (1 | Filename), data = aif1) 
## fixed-effect model matrix is rank deficient so dropping 2 columns / coefficients
## boundary (singular) fit: see help('isSingular')
#model_f <- lmer(formant~area*t + (1 | Filename), data = aif1) 

coefs <- data.frame(coef(summary(model_f)))
coefs <- coefs %>%
  mutate(Estimate = round(Estimate, 4)) %>%
  mutate(Std..Error = round(Std..Error, 4)) %>%
  mutate(t.value = round(t.value, 4)) %>%
  mutate(p.z = round(2*(1-pnorm(abs(coefs$t.value))), 4)) %>%
  mutate(p.z = ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.001, "<.001",
                      ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.01, "<.01",
                             ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.05, "<.05",
                                    round(2*(1-pnorm(abs(coefs$t.value))), 4)))))
coefs
##                               Estimate Std..Error t.value    p.z
## (Intercept)                     0.3730     0.1012  3.6860  <.001
## proLow                          0.4986     0.2262  2.2039   <.05
## genderMale                      0.0816     0.1440  0.5665  0.571
## ageYoung                        0.7493     0.2830  2.6473   <.01
## RegionSouth                     0.2599     0.1357  1.9143 0.0556
## proLow:genderMale              -0.4294     0.3204 -1.3403 0.1802
## proLow:ageYoung                -0.8568     0.3595 -2.3832   <.05
## genderMale:ageYoung            -0.5627     0.2437 -2.3091   <.05
## proLow:RegionSouth             -0.3634     0.4204 -0.8643 0.3874
## genderMale:RegionSouth         -0.0540     0.1979 -0.2729 0.7849
## ageYoung:RegionSouth           -0.6483     0.2268 -2.8584   <.01
## proLow:genderMale:ageYoung      0.4440     0.4364  1.0174 0.3089
## proLow:genderMale:RegionSouth   0.3980     0.3400  1.1704 0.2418
## proLow:ageYoung:RegionSouth     0.4934     0.4335  1.1382  0.255
#summary(model_f)
#write.csv(coefs, "table10.csv")

gender plot

ai_read_gender <- ai_read1 %>%
  mutate(ag = paste0(area, age, gender)) %>%
  mutate(age = ifelse(age == "O", "Old", "Young")) %>%
  mutate(area = ifelse(area == "N", "North", "South")) %>%
  mutate(gender = ifelse(gender == "F", "Female", "Male")) %>%
  mutate(lan = ifelse(lan == "TW", "Min", "Mandarin")) %>%
  mutate(group = paste(area, age)) %>%
  select(-time) %>%
  mutate(f = ifelse(f == 1, "F1", "F2")) %>%
  filter(formant != is.na(formant)) %>%
  spread(f, formant) %>%
  group_by(Filename, t) %>%
  filter(F1 %in% F1[abs(scale(F1)) <= 3], F2 %in% F2[abs(scale(F2)) <= 3]) %>%
  ungroup() %>%
  filter(t != 0) %>%
  group_by(Filename, age, gender, area, ag) %>%
  mutate(F1 = scale(F1), F2 = scale(F2)) %>%
  ungroup() 
  # group_by(age, gender, area, ag, t) %>%
  # summarise(mean_F1 = mean(F1), mean_F2 = mean(F2))

ggplot(ai_read_gender, aes(x = F2, y = F1, color = gender)) + 
  #geom_point(aes(shape = age),alpha = 0.5) + 
  geom_smooth(aes(linetype = gender, fill = gender)) +
  #stat_ellipse() + 
  theme_minimal() +
  scale_x_reverse() + 
  scale_y_reverse() + 
  #facet_grid(lan~group)+
  scale_color_manual(labels = c("Female", "Male"),
                    values = c("#9d4edd", "#329cff")) +
  scale_fill_manual(labels = c("Female", "Male"),
                    values = c("#9d4edd", "#329cff")) +
  theme_bw() +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        strip.text.y = element_text(size = 18),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        axis.title = element_text(size = 18),
        panel.background=element_rect(colour="black", size = 1),
        strip.background = element_rect(color = "black", size = 0.8),
        #legend.position = c(0.93, 0.4),
        legend.background = element_rect(fill="white",
                                  size=0.3, linetype="solid", 
                                  colour ="black"),
        axis.text.y = element_text(size = 16)) +
  scale_shape_manual(values = 1:10) + 
  ggtitle("Lobanov-transformed vowel plot")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#ggsave("f2_gender.png", dpi = 600, width = 10, height = 6)

old proficiency

ai_readm <- ai_read1 %>%
  filter(lan == "TW") %>%
  filter(gender == "M") %>%
  mutate(lans = "Male TSM") %>%
  mutate(pro = ifelse(proficiency >= mean(proficiency), "High", "Low")) %>%
  group_by(Filename) %>%
  mutate(m = mean(formant), F1 = scale(formant)) %>%
  filter(f == "1")

ai_readf <- ai_read1 %>%
  filter(lan == "GY") %>%
  filter(gender == "F") %>%
  mutate(lans = "Female TM") %>%
  mutate(pro = ifelse(proficiency >= mean(proficiency), "High", "Low")) %>%
  group_by(Filename) %>%
  mutate(m = mean(formant), F1 = scale(formant)) %>%
  filter(f == "1")

f111 <- bind_rows(ai_readm, ai_readf)

ai_readff <- ai_read1 %>%
  filter(lan == "GY") %>%
  filter(gender == "F") %>%
  mutate(lans = "Male TSM") %>%
  mutate(pro = ifelse(proficiency >= mean(proficiency), "High", "Low")) %>%
  group_by(Filename) %>%
  mutate(m = mean(formant), F2 = scale(formant)) %>%
  filter(f == "2") 

ai_readmm <- ai_read1 %>%
  filter(lan == "TW") %>%
  filter(gender == "M") %>%
  mutate(lans = "Female TM") %>%
  mutate(pro = ifelse(proficiency >= mean(proficiency), "High", "Low")) %>%
  group_by(Filename) %>%
  mutate(m = mean(formant), F2 = scale(formant)) %>%
  filter(f == "2") 

f222 <- bind_rows(ai_readmm, ai_readff)

ai_read_fp <- ggplot() +
  geom_smooth(data = f111, aes(x = t, y = F1, group = pro, fill = pro, linetype = pro, color = pro), size = 2, se = T) +
  geom_smooth(data = f222, aes(x = t, y = F2, group = pro, fill = pro, linetype = pro, color = pro), size = 2, se = T) +
  labs(x = "Time", y = "z-score", linetype = "Proficiency", 
       fill = "Proficiency", color = "Proficiency") +
  scale_color_manual(labels = c("High", "Low"),
                    values = c("#F78914", "#B6EE56")) +
  scale_fill_manual(labels = c("High", "Low"),
                    values = c("#F78914", "#B6EE56")) +
  theme_bw() +
  facet_grid(~lans) +
  theme(plot.title = element_text(size = 18, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(size = 18),
        strip.text.x = element_text(size = 18),
        legend.title = element_text(size = 18),
        legend.text = element_text(size = 14),
        axis.title = element_text(size = 18),
        axis.text.y = element_text(size = 16))

ai_read_fp
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 2 rows containing non-finite values (`stat_smooth()`).

#ggsave("read_pro.png", dpi = 600, width = 8, height = 5)

stats

aif1 <- ai_read1 %>%
  filter(gender == "F") %>%
  mutate(lan = as.factor(lan)) %>%
  filter(age == "O") %>%
  filter(area == "S") %>%
  filter(lan == "TW") %>%
  mutate(t = as.numeric(t)) %>%
  filter(f == "1")

model_f <- lmer(formant~proficiency*t + (1 | Filename), data = aif1) 
coefs <- data.frame(coef(summary(model_f)))
coefs <- coefs %>%
  mutate(Estimate = round(Estimate, 4)) %>%
  mutate(Std..Error = round(Std..Error, 4)) %>%
  mutate(t.value = round(t.value, 4)) %>%
  mutate(p.z = round(2*(1-pnorm(abs(coefs$t.value))), 4)) %>%
  mutate(p.z = ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.001, "***",
                      ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.01, "**",
                             ifelse(2*(1-pnorm(abs(coefs$t.value))) < 0.05, "*",
                                    round(2*(1-pnorm(abs(coefs$t.value))), 4)))))
coefs
##               Estimate Std..Error t.value    p.z
## (Intercept)   919.2616   222.2079  4.1369    ***
## proficiency   -21.1037   225.1560 -0.0937 0.9253
## t             -14.6667    15.4160 -0.9514 0.3414
## proficiency:t -21.3905    15.6107 -1.3702 0.1706